In [2]:
# Author: Stephen Situ
# Apache Spark is an open-source unified analytics engine for large-scale data processing. Using parallel computing, it allows us to
# process data at speeds needed for machine learning, AI, and big data. This is a practice project on using Pyspark and Spark SQL
# in Python. We get familiar with spark dataframes and try to see how they differ from pandas dataframes. Additionally, we practice
# using Spark SQL to run SQL queries on Spark dataframes. 
In [ ]:
# Import libraries
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
In [3]:
# Create spark session
spark = SparkSession.builder.appName('Test').getOrCreate()
In [4]:
# Check spark session is running
spark
Out[4]:

SparkSession - in-memory

SparkContext

Spark UI

Version
v3.2.1
Master
local[*]
AppName
Test
In [6]:
# Read CSV
df = spark.read.csv('netflix1.csv')
df.show()
+-------+-------+--------------------+--------------------+--------------+----------+------------+------+---------+--------------------+
|    _c0|    _c1|                 _c2|                 _c3|           _c4|       _c5|         _c6|   _c7|      _c8|                 _c9|
+-------+-------+--------------------+--------------------+--------------+----------+------------+------+---------+--------------------+
|show_id|   type|               title|            director|       country|date_added|release_year|rating| duration|           listed_in|
|     s1|  Movie|Dick Johnson Is Dead|     Kirsten Johnson| United States| 9/25/2021|        2020| PG-13|   90 min|       Documentaries|
|     s3|TV Show|           Ganglands|     Julien Leclercq|        France| 9/24/2021|        2021| TV-MA| 1 Season|Crime TV Shows, I...|
|     s6|TV Show|       Midnight Mass|       Mike Flanagan| United States| 9/24/2021|        2021| TV-MA| 1 Season|TV Dramas, TV Hor...|
|    s14|  Movie|Confessions of an...|       Bruno Garotti|        Brazil| 9/22/2021|        2021| TV-PG|   91 min|Children & Family...|
|     s8|  Movie|             Sankofa|        Haile Gerima| United States| 9/24/2021|        1993| TV-MA|  125 min|Dramas, Independe...|
|     s9|TV Show|The Great British...|     Andy Devonshire|United Kingdom| 9/24/2021|        2021| TV-14|9 Seasons|British TV Shows,...|
|    s10|  Movie|        The Starling|      Theodore Melfi| United States| 9/24/2021|        2021| PG-13|  104 min|    Comedies, Dramas|
|   s939|  Movie|Motu Patlu in the...|         Suhas Kadav|         India|  5/1/2021|        2019| TV-Y7|   87 min|Children & Family...|
|    s13|  Movie|        Je Suis Karl| Christian Schwochow|       Germany| 9/23/2021|        2021| TV-MA|  127 min|Dramas, Internati...|
|   s940|  Movie|Motu Patlu in Won...|         Suhas Kadav|         India|  5/1/2021|        2013| TV-Y7|   76 min|Children & Family...|
|   s941|  Movie|Motu Patlu: Deep ...|         Suhas Kadav|         India|  5/1/2021|        2014| TV-Y7|   76 min|Children & Family...|
|   s942|  Movie|Motu Patlu: Missi...|         Suhas Kadav|         India|  5/1/2021|        2013| TV-Y7|   71 min|Children & Family...|
|   s852|  Movie|    99 Songs (Tamil)|           Not Given|      Pakistan| 5/21/2021|        2021| TV-14|  131 min|Dramas, Internati...|
|   s471|  Movie|Bridgerton - The ...|Krysia Plonka, Kr...| United States| 7/13/2021|        2021| TV-14|   39 min|              Movies|
|   s730|  Movie|Bling Empire - Th...|Krysia Plonka, Kr...| United States| 6/12/2021|        2021| TV-MA|   36 min|              Movies|
|   s731|  Movie|Cobra Kai - The A...|Krysia Plonka, Kr...| United States| 6/12/2021|        2021| TV-MA|   34 min|              Movies|
|   s913|  Movie|The Circle - The ...|Krysia Plonka, Kr...| United States|  5/7/2021|        2021| TV-14|   35 min|            Comedies|
|     s4|TV Show|Jailbirds New Orl...|           Not Given|      Pakistan| 9/24/2021|        2021| TV-MA| 1 Season|Docuseries, Reali...|
|    s15|TV Show|Crime Stories: In...|           Not Given|      Pakistan| 9/22/2021|        2021| TV-MA| 1 Season|British TV Shows,...|
+-------+-------+--------------------+--------------------+--------------+----------+------------+------+---------+--------------------+
only showing top 20 rows

In [13]:
# Read CSV with option header = True & inferSchema = True
df = spark.read.csv('netflix1.csv',header=True,inferSchema=True)
df.show()
+-------+-------+--------------------+--------------------+--------------+----------+------------+------+---------+--------------------+
|show_id|   type|               title|            director|       country|date_added|release_year|rating| duration|           listed_in|
+-------+-------+--------------------+--------------------+--------------+----------+------------+------+---------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|     Kirsten Johnson| United States| 9/25/2021|        2020| PG-13|   90 min|       Documentaries|
|     s3|TV Show|           Ganglands|     Julien Leclercq|        France| 9/24/2021|        2021| TV-MA| 1 Season|Crime TV Shows, I...|
|     s6|TV Show|       Midnight Mass|       Mike Flanagan| United States| 9/24/2021|        2021| TV-MA| 1 Season|TV Dramas, TV Hor...|
|    s14|  Movie|Confessions of an...|       Bruno Garotti|        Brazil| 9/22/2021|        2021| TV-PG|   91 min|Children & Family...|
|     s8|  Movie|             Sankofa|        Haile Gerima| United States| 9/24/2021|        1993| TV-MA|  125 min|Dramas, Independe...|
|     s9|TV Show|The Great British...|     Andy Devonshire|United Kingdom| 9/24/2021|        2021| TV-14|9 Seasons|British TV Shows,...|
|    s10|  Movie|        The Starling|      Theodore Melfi| United States| 9/24/2021|        2021| PG-13|  104 min|    Comedies, Dramas|
|   s939|  Movie|Motu Patlu in the...|         Suhas Kadav|         India|  5/1/2021|        2019| TV-Y7|   87 min|Children & Family...|
|    s13|  Movie|        Je Suis Karl| Christian Schwochow|       Germany| 9/23/2021|        2021| TV-MA|  127 min|Dramas, Internati...|
|   s940|  Movie|Motu Patlu in Won...|         Suhas Kadav|         India|  5/1/2021|        2013| TV-Y7|   76 min|Children & Family...|
|   s941|  Movie|Motu Patlu: Deep ...|         Suhas Kadav|         India|  5/1/2021|        2014| TV-Y7|   76 min|Children & Family...|
|   s942|  Movie|Motu Patlu: Missi...|         Suhas Kadav|         India|  5/1/2021|        2013| TV-Y7|   71 min|Children & Family...|
|   s852|  Movie|    99 Songs (Tamil)|           Not Given|      Pakistan| 5/21/2021|        2021| TV-14|  131 min|Dramas, Internati...|
|   s471|  Movie|Bridgerton - The ...|Krysia Plonka, Kr...| United States| 7/13/2021|        2021| TV-14|   39 min|              Movies|
|   s730|  Movie|Bling Empire - Th...|Krysia Plonka, Kr...| United States| 6/12/2021|        2021| TV-MA|   36 min|              Movies|
|   s731|  Movie|Cobra Kai - The A...|Krysia Plonka, Kr...| United States| 6/12/2021|        2021| TV-MA|   34 min|              Movies|
|   s913|  Movie|The Circle - The ...|Krysia Plonka, Kr...| United States|  5/7/2021|        2021| TV-14|   35 min|            Comedies|
|     s4|TV Show|Jailbirds New Orl...|           Not Given|      Pakistan| 9/24/2021|        2021| TV-MA| 1 Season|Docuseries, Reali...|
|    s15|TV Show|Crime Stories: In...|           Not Given|      Pakistan| 9/22/2021|        2021| TV-MA| 1 Season|British TV Shows,...|
|  s3232|  Movie| True: Winter Wishes|Mark Thornton, To...| United States|11/26/2019|        2019|  TV-Y|   46 min|Children & Family...|
+-------+-------+--------------------+--------------------+--------------+----------+------------+------+---------+--------------------+
only showing top 20 rows

In [14]:
# Print Schema
df.printSchema()
root
 |-- show_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- title: string (nullable = true)
 |-- director: string (nullable = true)
 |-- country: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- listed_in: string (nullable = true)

In [15]:
# Print Columns
df.columns
Out[15]:
['show_id',
 'type',
 'title',
 'director',
 'country',
 'date_added',
 'release_year',
 'rating',
 'duration',
 'listed_in']
In [20]:
# Print Head, top 5
df.head(5)
Out[20]:
[Row(show_id='s1', type='Movie', title='Dick Johnson Is Dead', director='Kirsten Johnson', country='United States', date_added='9/25/2021', release_year='2020', rating='PG-13', duration='90 min', listed_in='Documentaries'),
 Row(show_id='s3', type='TV Show', title='Ganglands', director='Julien Leclercq', country='France', date_added='9/24/2021', release_year='2021', rating='TV-MA', duration='1 Season', listed_in='Crime TV Shows, International TV Shows, TV Action & Adventure'),
 Row(show_id='s6', type='TV Show', title='Midnight Mass', director='Mike Flanagan', country='United States', date_added='9/24/2021', release_year='2021', rating='TV-MA', duration='1 Season', listed_in='TV Dramas, TV Horror, TV Mysteries'),
 Row(show_id='s14', type='Movie', title='Confessions of an Invisible Girl', director='Bruno Garotti', country='Brazil', date_added='9/22/2021', release_year='2021', rating='TV-PG', duration='91 min', listed_in='Children & Family Movies, Comedies'),
 Row(show_id='s8', type='Movie', title='Sankofa', director='Haile Gerima', country='United States', date_added='9/24/2021', release_year='1993', rating='TV-MA', duration='125 min', listed_in='Dramas, Independent Movies, International Movies')]
In [23]:
# Print Data types
df.dtypes
Out[23]:
[('show_id', 'string'),
 ('type', 'string'),
 ('title', 'string'),
 ('director', 'string'),
 ('country', 'string'),
 ('date_added', 'string'),
 ('release_year', 'string'),
 ('rating', 'string'),
 ('duration', 'string'),
 ('listed_in', 'string')]
In [35]:
# describe
df.describe().show()
+-------+----------------+-------------+---------------------------------+------------+--------+----------+------------------+--------------------+--------+------------------+
|summary|         show_id|         type|                            title|    director| country|date_added|      release_year|              rating|duration|         listed_in|
+-------+----------------+-------------+---------------------------------+------------+--------+----------+------------------+--------------------+--------+------------------+
|  count|            8791|         8791|                             8791|        8790|    8790|      8790|              8790|                8790|    8789|              8789|
|   mean|            null|         null|               1124.7692307692307|        null|  1944.0|      null|2014.1911480259416|                null|    null|              null|
| stddev|            null|         null|               1042.1800991068478|        null|    null|      null|  8.79415428974682|                null|    null|              null|
|    min|Flying Fortress"|        Movie|                           #Alive|   3/31/2017|    1944|  1/1/2008|              1925|Classic Movies, D...|1 Season|Action & Adventure|
|    max|            s999|William Wyler|최강전사 미니특공대 : 영웅의 탄생|Şenol Sönmez|Zimbabwe|     TV-PG|            40 min|                  UR|  99 min|         Thrillers|
+-------+----------------+-------------+---------------------------------+------------+--------+----------+------------------+--------------------+--------+------------------+

In [26]:
# Selecting a column "type"
df.select('type').show()
+-------+
|   type|
+-------+
|  Movie|
|TV Show|
|TV Show|
|  Movie|
|  Movie|
|TV Show|
|  Movie|
|  Movie|
|  Movie|
|  Movie|
|  Movie|
|  Movie|
|  Movie|
|  Movie|
|  Movie|
|  Movie|
|  Movie|
|TV Show|
|TV Show|
|  Movie|
+-------+
only showing top 20 rows

In [29]:
# Selecting Multiple columns
df.select(['type','country']).show()
+-------+--------------+
|   type|       country|
+-------+--------------+
|  Movie| United States|
|TV Show|        France|
|TV Show| United States|
|  Movie|        Brazil|
|  Movie| United States|
|TV Show|United Kingdom|
|  Movie| United States|
|  Movie|         India|
|  Movie|       Germany|
|  Movie|         India|
|  Movie|         India|
|  Movie|         India|
|  Movie|      Pakistan|
|  Movie| United States|
|  Movie| United States|
|  Movie| United States|
|  Movie| United States|
|TV Show|      Pakistan|
|TV Show|      Pakistan|
|  Movie| United States|
+-------+--------------+
only showing top 20 rows

In [36]:
# Adding a column "New Year"
test = df.withColumn('New Year',df['release_year']+1)
test.select(['New Year','release_year']).show()
+--------+------------+
|New Year|release_year|
+--------+------------+
|  2021.0|        2020|
|  2022.0|        2021|
|  2022.0|        2021|
|  2022.0|        2021|
|  1994.0|        1993|
|  2022.0|        2021|
|  2022.0|        2021|
|  2020.0|        2019|
|  2022.0|        2021|
|  2014.0|        2013|
|  2015.0|        2014|
|  2014.0|        2013|
|  2022.0|        2021|
|  2022.0|        2021|
|  2022.0|        2021|
|  2022.0|        2021|
|  2022.0|        2021|
|  2022.0|        2021|
|  2022.0|        2021|
|  2020.0|        2019|
+--------+------------+
only showing top 20 rows

In [46]:
# Renaming "New Year" column to "Great Year"
test1 = test.withColumnRenamed('New Year','Great Year')
test1.columns
Out[46]:
['show_id',
 'type',
 'title',
 'director',
 'country',
 'date_added',
 'release_year',
 'rating',
 'duration',
 'listed_in',
 'Great Year']
In [49]:
# Dropping a column
test1 = test1.drop('Great Year')
test1.columns
Out[49]:
['show_id',
 'type',
 'title',
 'director',
 'country',
 'date_added',
 'release_year',
 'rating',
 'duration',
 'listed_in']
In [53]:
# Drop all N/A's  
df.na.drop().show()
+-------+-------+--------------------+--------------------+--------------+----------+------------+------+---------+--------------------+
|show_id|   type|               title|            director|       country|date_added|release_year|rating| duration|           listed_in|
+-------+-------+--------------------+--------------------+--------------+----------+------------+------+---------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|     Kirsten Johnson| United States| 9/25/2021|        2020| PG-13|   90 min|       Documentaries|
|     s3|TV Show|           Ganglands|     Julien Leclercq|        France| 9/24/2021|        2021| TV-MA| 1 Season|Crime TV Shows, I...|
|     s6|TV Show|       Midnight Mass|       Mike Flanagan| United States| 9/24/2021|        2021| TV-MA| 1 Season|TV Dramas, TV Hor...|
|    s14|  Movie|Confessions of an...|       Bruno Garotti|        Brazil| 9/22/2021|        2021| TV-PG|   91 min|Children & Family...|
|     s8|  Movie|             Sankofa|        Haile Gerima| United States| 9/24/2021|        1993| TV-MA|  125 min|Dramas, Independe...|
|     s9|TV Show|The Great British...|     Andy Devonshire|United Kingdom| 9/24/2021|        2021| TV-14|9 Seasons|British TV Shows,...|
|    s10|  Movie|        The Starling|      Theodore Melfi| United States| 9/24/2021|        2021| PG-13|  104 min|    Comedies, Dramas|
|   s939|  Movie|Motu Patlu in the...|         Suhas Kadav|         India|  5/1/2021|        2019| TV-Y7|   87 min|Children & Family...|
|    s13|  Movie|        Je Suis Karl| Christian Schwochow|       Germany| 9/23/2021|        2021| TV-MA|  127 min|Dramas, Internati...|
|   s940|  Movie|Motu Patlu in Won...|         Suhas Kadav|         India|  5/1/2021|        2013| TV-Y7|   76 min|Children & Family...|
|   s941|  Movie|Motu Patlu: Deep ...|         Suhas Kadav|         India|  5/1/2021|        2014| TV-Y7|   76 min|Children & Family...|
|   s942|  Movie|Motu Patlu: Missi...|         Suhas Kadav|         India|  5/1/2021|        2013| TV-Y7|   71 min|Children & Family...|
|   s852|  Movie|    99 Songs (Tamil)|           Not Given|      Pakistan| 5/21/2021|        2021| TV-14|  131 min|Dramas, Internati...|
|   s471|  Movie|Bridgerton - The ...|Krysia Plonka, Kr...| United States| 7/13/2021|        2021| TV-14|   39 min|              Movies|
|   s730|  Movie|Bling Empire - Th...|Krysia Plonka, Kr...| United States| 6/12/2021|        2021| TV-MA|   36 min|              Movies|
|   s731|  Movie|Cobra Kai - The A...|Krysia Plonka, Kr...| United States| 6/12/2021|        2021| TV-MA|   34 min|              Movies|
|   s913|  Movie|The Circle - The ...|Krysia Plonka, Kr...| United States|  5/7/2021|        2021| TV-14|   35 min|            Comedies|
|     s4|TV Show|Jailbirds New Orl...|           Not Given|      Pakistan| 9/24/2021|        2021| TV-MA| 1 Season|Docuseries, Reali...|
|    s15|TV Show|Crime Stories: In...|           Not Given|      Pakistan| 9/22/2021|        2021| TV-MA| 1 Season|British TV Shows,...|
|  s3232|  Movie| True: Winter Wishes|Mark Thornton, To...| United States|11/26/2019|        2019|  TV-Y|   46 min|Children & Family...|
+-------+-------+--------------------+--------------------+--------------+----------+------------+------+---------+--------------------+
only showing top 20 rows

In [ ]:
# Other Drop Variations
# drop any N/A's
# df.na.drop(how="any").show()
# drop only if all values are null
# df.na.drop(how="all").show()
# drop if 2 or more N/A's
# df.na.drop(how="any",thresh=2).show()
# drop N/A's in certain columns
# df.na.drop(how="any",subset=['director']).show()
In [ ]:
# Filling N/A values
# df.na.fill('Missing Values').show()
# df.na.fill('Missing Values',['A','B']).show()
# Use from pyspark.ml.feature import Imputer to do imputation
In [79]:
# Filtering functions
df.filter(df['country']=="Pakistan").show()
+-------+-------+--------------------+---------+--------+----------+------------+------+---------+--------------------+
|show_id|   type|               title| director| country|date_added|release_year|rating| duration|           listed_in|
+-------+-------+--------------------+---------+--------+----------+------------+------+---------+--------------------+
|   s852|  Movie|    99 Songs (Tamil)|Not Given|Pakistan| 5/21/2021|        2021| TV-14|  131 min|Dramas, Internati...|
|     s4|TV Show|Jailbirds New Orl...|Not Given|Pakistan| 9/24/2021|        2021| TV-MA| 1 Season|Docuseries, Reali...|
|    s15|TV Show|Crime Stories: In...|Not Given|Pakistan| 9/22/2021|        2021| TV-MA| 1 Season|British TV Shows,...|
|    s20|TV Show|              Jaguar|Not Given|Pakistan| 9/22/2021|        2021| TV-MA| 1 Season|International TV ...|
|    s32|TV Show|  Chicago Party Aunt|Not Given|Pakistan| 9/17/2021|        2021| TV-MA| 1 Season|         TV Comedies|
|    s34|TV Show|          Squid Game|Not Given|Pakistan| 9/17/2021|        2021| TV-MA| 1 Season|International TV ...|
|    s35|TV Show|Tayo and Little W...|Not Given|Pakistan| 9/17/2021|        2020| TV-Y7| 1 Season|            Kids' TV|
|    s75|TV Show|The World's Most ...|Not Given|Pakistan| 9/14/2021|        2021| TV-PG|2 Seasons|          Reality TV|
|    s84|TV Show|  Metal Shop Masters|Not Given|Pakistan| 9/10/2021|        2021| TV-MA| 1 Season|          Reality TV|
|    s86|TV Show|Pokémon Master Jo...|Not Given|Pakistan| 9/10/2021|        2021| TV-Y7| 1 Season|Anime Series, Kid...|
|    s88|TV Show|       Titipo Titipo|Not Given|Pakistan| 9/10/2021|        2019|  TV-Y|2 Seasons|Kids' TV, Korean ...|
|    s90|TV Show|         Mighty Raju|Not Given|Pakistan|  9/9/2021|        2017| TV-Y7|4 Seasons|            Kids' TV|
|   s101|TV Show|Tobot Galaxy Dete...|Not Given|Pakistan|  9/7/2021|        2019| TV-Y7|2 Seasons|            Kids' TV|
|   s122|TV Show|      Hotel Del Luna|Not Given|Pakistan|  9/2/2021|        2019| TV-14| 1 Season|International TV ...|
|   s133|TV Show|Brave Animated Se...|Not Given|Pakistan|  9/1/2021|        2021| TV-MA| 1 Season|International TV ...|
|   s148|TV Show|  How to Be a Cowboy|Not Given|Pakistan|  9/1/2021|        2021| TV-PG| 1 Season|          Reality TV|
|   s166|TV Show|            Oldsters|Not Given|Pakistan|  9/1/2021|        2019| TV-MA| 1 Season|Crime TV Shows, I...|
|   s190|TV Show|    Bread Barbershop|Not Given|Pakistan| 8/28/2021|        2020|  TV-Y|2 Seasons|Kids' TV, TV Come...|
|   s182|TV Show|Turning Point: 9/...|Not Given|Pakistan|  9/1/2021|        2021| TV-14| 1 Season|          Docuseries|
|   s187|TV Show|Hometown Cha-Cha-Cha|Not Given|Pakistan| 8/29/2021|        2021| TV-14| 1 Season|International TV ...|
+-------+-------+--------------------+---------+--------+----------+------------+------+---------+--------------------+
only showing top 20 rows

In [88]:
# Using 2 conditions
df.filter((df['country']=="Pakistan") & (df['type']=="TV Show")).show()
+-------+-------+--------------------+---------+--------+----------+------------+------+---------+--------------------+
|show_id|   type|               title| director| country|date_added|release_year|rating| duration|           listed_in|
+-------+-------+--------------------+---------+--------+----------+------------+------+---------+--------------------+
|     s4|TV Show|Jailbirds New Orl...|Not Given|Pakistan| 9/24/2021|        2021| TV-MA| 1 Season|Docuseries, Reali...|
|    s15|TV Show|Crime Stories: In...|Not Given|Pakistan| 9/22/2021|        2021| TV-MA| 1 Season|British TV Shows,...|
|    s20|TV Show|              Jaguar|Not Given|Pakistan| 9/22/2021|        2021| TV-MA| 1 Season|International TV ...|
|    s32|TV Show|  Chicago Party Aunt|Not Given|Pakistan| 9/17/2021|        2021| TV-MA| 1 Season|         TV Comedies|
|    s34|TV Show|          Squid Game|Not Given|Pakistan| 9/17/2021|        2021| TV-MA| 1 Season|International TV ...|
|    s35|TV Show|Tayo and Little W...|Not Given|Pakistan| 9/17/2021|        2020| TV-Y7| 1 Season|            Kids' TV|
|    s75|TV Show|The World's Most ...|Not Given|Pakistan| 9/14/2021|        2021| TV-PG|2 Seasons|          Reality TV|
|    s84|TV Show|  Metal Shop Masters|Not Given|Pakistan| 9/10/2021|        2021| TV-MA| 1 Season|          Reality TV|
|    s86|TV Show|Pokémon Master Jo...|Not Given|Pakistan| 9/10/2021|        2021| TV-Y7| 1 Season|Anime Series, Kid...|
|    s88|TV Show|       Titipo Titipo|Not Given|Pakistan| 9/10/2021|        2019|  TV-Y|2 Seasons|Kids' TV, Korean ...|
|    s90|TV Show|         Mighty Raju|Not Given|Pakistan|  9/9/2021|        2017| TV-Y7|4 Seasons|            Kids' TV|
|   s101|TV Show|Tobot Galaxy Dete...|Not Given|Pakistan|  9/7/2021|        2019| TV-Y7|2 Seasons|            Kids' TV|
|   s122|TV Show|      Hotel Del Luna|Not Given|Pakistan|  9/2/2021|        2019| TV-14| 1 Season|International TV ...|
|   s133|TV Show|Brave Animated Se...|Not Given|Pakistan|  9/1/2021|        2021| TV-MA| 1 Season|International TV ...|
|   s148|TV Show|  How to Be a Cowboy|Not Given|Pakistan|  9/1/2021|        2021| TV-PG| 1 Season|          Reality TV|
|   s166|TV Show|            Oldsters|Not Given|Pakistan|  9/1/2021|        2019| TV-MA| 1 Season|Crime TV Shows, I...|
|   s190|TV Show|    Bread Barbershop|Not Given|Pakistan| 8/28/2021|        2020|  TV-Y|2 Seasons|Kids' TV, TV Come...|
|   s182|TV Show|Turning Point: 9/...|Not Given|Pakistan|  9/1/2021|        2021| TV-14| 1 Season|          Docuseries|
|   s187|TV Show|Hometown Cha-Cha-Cha|Not Given|Pakistan| 8/29/2021|        2021| TV-14| 1 Season|International TV ...|
|   s188|TV Show|The Ingenuity of ...|Not Given|Pakistan| 8/29/2021|        2021|  TV-G| 1 Season|International TV ...|
+-------+-------+--------------------+---------+--------+----------+------------+------+---------+--------------------+
only showing top 20 rows

In [92]:
# Use numeric filter
df.filter(df['release_year']<=2019).show()
+-------+-------+--------------------+--------------------+-------------+----------+------------+------+---------+--------------------+
|show_id|   type|               title|            director|      country|date_added|release_year|rating| duration|           listed_in|
+-------+-------+--------------------+--------------------+-------------+----------+------------+------+---------+--------------------+
|     s8|  Movie|             Sankofa|        Haile Gerima|United States| 9/24/2021|        1993| TV-MA|  125 min|Dramas, Independe...|
|   s939|  Movie|Motu Patlu in the...|         Suhas Kadav|        India|  5/1/2021|        2019| TV-Y7|   87 min|Children & Family...|
|   s940|  Movie|Motu Patlu in Won...|         Suhas Kadav|        India|  5/1/2021|        2013| TV-Y7|   76 min|Children & Family...|
|   s941|  Movie|Motu Patlu: Deep ...|         Suhas Kadav|        India|  5/1/2021|        2014| TV-Y7|   76 min|Children & Family...|
|   s942|  Movie|Motu Patlu: Missi...|         Suhas Kadav|        India|  5/1/2021|        2013| TV-Y7|   71 min|Children & Family...|
|  s3232|  Movie| True: Winter Wishes|Mark Thornton, To...|United States|11/26/2019|        2019|  TV-Y|   46 min|Children & Family...|
|  s4832|TV Show|True: Magical Fri...|Mark Thornton, To...|United States| 6/15/2018|        2018|  TV-Y| 1 Season|            Kids' TV|
|  s4833|TV Show|True: Wonderful W...|Mark Thornton, To...|United States| 6/15/2018|        2018|  TV-Y| 1 Season|            Kids' TV|
|  s4857|TV Show|Dance & Sing with...|Mark Thornton, To...|United States| 5/18/2018|        2018|  TV-Y| 1 Season|            Kids' TV|
|  s7930|  Movie|     Samudri Lootere|    Anirban Majumder|    Not Given| 6/18/2019|        2018|  TV-Y|   65 min|Children & Family...|
|    s25|  Movie|               Jeans|          S. Shankar|        India| 9/21/2021|        1998| TV-14|  166 min|Comedies, Interna...|
|    s28|  Movie|           Grown Ups|        Dennis Dugan|United States| 9/20/2021|        2010| PG-13|  103 min|            Comedies|
|    s29|  Movie|          Dark Skies|       Scott Stewart|United States| 9/19/2021|        2013| PG-13|   97 min|Horror Movies, Sc...|
|    s30|  Movie|            Paranoia|      Robert Luketic|United States| 9/19/2021|        2013| PG-13|  106 min|           Thrillers|
|    s88|TV Show|       Titipo Titipo|           Not Given|     Pakistan| 9/10/2021|        2019|  TV-Y|2 Seasons|Kids' TV, Korean ...|
|    s90|TV Show|         Mighty Raju|           Not Given|     Pakistan|  9/9/2021|        2017| TV-Y7|4 Seasons|            Kids' TV|
|   s101|TV Show|Tobot Galaxy Dete...|           Not Given|     Pakistan|  9/7/2021|        2019| TV-Y7|2 Seasons|            Kids' TV|
|   s122|TV Show|      Hotel Del Luna|           Not Given|     Pakistan|  9/2/2021|        2019| TV-14| 1 Season|International TV ...|
|   s166|TV Show|            Oldsters|           Not Given|     Pakistan|  9/1/2021|        2019| TV-MA| 1 Season|Crime TV Shows, I...|
|    s27|  Movie|      Minsara Kanavu|         Rajiv Menon|        India| 9/21/2021|        1997| TV-PG|  147 min|Comedies, Interna...|
+-------+-------+--------------------+--------------------+-------------+----------+------------+------+---------+--------------------+
only showing top 20 rows

In [93]:
# Filtering and Selecting
df.filter(df['release_year']<=2019).select(['title','country']).show()
+--------------------+-------------+
|               title|      country|
+--------------------+-------------+
|             Sankofa|United States|
|Motu Patlu in the...|        India|
|Motu Patlu in Won...|        India|
|Motu Patlu: Deep ...|        India|
|Motu Patlu: Missi...|        India|
| True: Winter Wishes|United States|
|True: Magical Fri...|United States|
|True: Wonderful W...|United States|
|Dance & Sing with...|United States|
|     Samudri Lootere|    Not Given|
|               Jeans|        India|
|           Grown Ups|United States|
|          Dark Skies|United States|
|            Paranoia|United States|
|       Titipo Titipo|     Pakistan|
|         Mighty Raju|     Pakistan|
|Tobot Galaxy Dete...|     Pakistan|
|      Hotel Del Luna|     Pakistan|
|            Oldsters|     Pakistan|
|      Minsara Kanavu|        India|
+--------------------+-------------+
only showing top 20 rows

In [94]:
# Using the not "~" filter
df.filter(~(df['release_year']==2021)).show()
+-------+-------+--------------------+--------------------+-------------+----------+------------+------+---------+--------------------+
|show_id|   type|               title|            director|      country|date_added|release_year|rating| duration|           listed_in|
+-------+-------+--------------------+--------------------+-------------+----------+------------+------+---------+--------------------+
|     s1|  Movie|Dick Johnson Is Dead|     Kirsten Johnson|United States| 9/25/2021|        2020| PG-13|   90 min|       Documentaries|
|     s8|  Movie|             Sankofa|        Haile Gerima|United States| 9/24/2021|        1993| TV-MA|  125 min|Dramas, Independe...|
|   s939|  Movie|Motu Patlu in the...|         Suhas Kadav|        India|  5/1/2021|        2019| TV-Y7|   87 min|Children & Family...|
|   s940|  Movie|Motu Patlu in Won...|         Suhas Kadav|        India|  5/1/2021|        2013| TV-Y7|   76 min|Children & Family...|
|   s941|  Movie|Motu Patlu: Deep ...|         Suhas Kadav|        India|  5/1/2021|        2014| TV-Y7|   76 min|Children & Family...|
|   s942|  Movie|Motu Patlu: Missi...|         Suhas Kadav|        India|  5/1/2021|        2013| TV-Y7|   71 min|Children & Family...|
|  s3232|  Movie| True: Winter Wishes|Mark Thornton, To...|United States|11/26/2019|        2019|  TV-Y|   46 min|Children & Family...|
|  s4832|TV Show|True: Magical Fri...|Mark Thornton, To...|United States| 6/15/2018|        2018|  TV-Y| 1 Season|            Kids' TV|
|  s4833|TV Show|True: Wonderful W...|Mark Thornton, To...|United States| 6/15/2018|        2018|  TV-Y| 1 Season|            Kids' TV|
|  s4857|TV Show|Dance & Sing with...|Mark Thornton, To...|United States| 5/18/2018|        2018|  TV-Y| 1 Season|            Kids' TV|
|    s17|  Movie|Europe's Most Dan...|Pedro de Echave G...|    Not Given| 9/22/2021|        2020| TV-MA|   67 min|Documentaries, In...|
|  s7930|  Movie|     Samudri Lootere|    Anirban Majumder|    Not Given| 6/18/2019|        2018|  TV-Y|   65 min|Children & Family...|
|    s25|  Movie|               Jeans|          S. Shankar|        India| 9/21/2021|        1998| TV-14|  166 min|Comedies, Interna...|
|    s28|  Movie|           Grown Ups|        Dennis Dugan|United States| 9/20/2021|        2010| PG-13|  103 min|            Comedies|
|    s29|  Movie|          Dark Skies|       Scott Stewart|United States| 9/19/2021|        2013| PG-13|   97 min|Horror Movies, Sc...|
|    s30|  Movie|            Paranoia|      Robert Luketic|United States| 9/19/2021|        2013| PG-13|  106 min|           Thrillers|
|    s35|TV Show|Tayo and Little W...|           Not Given|     Pakistan| 9/17/2021|        2020| TV-Y7| 1 Season|            Kids' TV|
|    s88|TV Show|       Titipo Titipo|           Not Given|     Pakistan| 9/10/2021|        2019|  TV-Y|2 Seasons|Kids' TV, Korean ...|
|    s90|TV Show|         Mighty Raju|           Not Given|     Pakistan|  9/9/2021|        2017| TV-Y7|4 Seasons|            Kids' TV|
|   s101|TV Show|Tobot Galaxy Dete...|           Not Given|     Pakistan|  9/7/2021|        2019| TV-Y7|2 Seasons|            Kids' TV|
+-------+-------+--------------------+--------------------+-------------+----------+------------+------+---------+--------------------+
only showing top 20 rows

In [101]:
# Cast release_year column to numeric
from pyspark.sql.types import DecimalType
df1 = df.withColumn("New_year", df["release_year"].cast(DecimalType(precision=12, scale=2)))
df1.dtypes
Out[101]:
[('show_id', 'string'),
 ('type', 'string'),
 ('title', 'string'),
 ('director', 'string'),
 ('country', 'string'),
 ('date_added', 'string'),
 ('release_year', 'string'),
 ('rating', 'string'),
 ('duration', 'string'),
 ('listed_in', 'string'),
 ('New_year', 'decimal(12,2)')]
In [102]:
# Using Group By and Aggragate functions
# Use Sum
df1.groupBy('type').sum().show()
+-------------+-------------+
|         type|sum(New_year)|
+-------------+-------------+
|      TV Show|   5372296.00|
|        Movie|  12330430.00|
|William Wyler|         null|
+-------------+-------------+

In [103]:
# # Other functions mean(),count(),max(),avg()
df1.groupby('type').mean().show()
+-------------+-------------+
|         type|avg(New_year)|
+-------------+-------------+
|      TV Show|  2016.627628|
|        Movie|  2013.131429|
|William Wyler|         null|
+-------------+-------------+

In [105]:
# Use dictionary to apply directly on a column
df1.agg({'new_year':'sum'}).show()
+-------------+
|sum(new_year)|
+-------------+
|  17702726.00|
+-------------+

In [106]:
# To use Spark SQL, create a temp view
df1.createOrReplaceTempView("df1")
In [107]:
# Now we can use SQL queries like normal
spark.sql("SELECT * FROM df1").show()
+-------+-------+--------------------+--------------------+--------------+----------+------------+------+---------+--------------------+--------+
|show_id|   type|               title|            director|       country|date_added|release_year|rating| duration|           listed_in|New_year|
+-------+-------+--------------------+--------------------+--------------+----------+------------+------+---------+--------------------+--------+
|     s1|  Movie|Dick Johnson Is Dead|     Kirsten Johnson| United States| 9/25/2021|        2020| PG-13|   90 min|       Documentaries| 2020.00|
|     s3|TV Show|           Ganglands|     Julien Leclercq|        France| 9/24/2021|        2021| TV-MA| 1 Season|Crime TV Shows, I...| 2021.00|
|     s6|TV Show|       Midnight Mass|       Mike Flanagan| United States| 9/24/2021|        2021| TV-MA| 1 Season|TV Dramas, TV Hor...| 2021.00|
|    s14|  Movie|Confessions of an...|       Bruno Garotti|        Brazil| 9/22/2021|        2021| TV-PG|   91 min|Children & Family...| 2021.00|
|     s8|  Movie|             Sankofa|        Haile Gerima| United States| 9/24/2021|        1993| TV-MA|  125 min|Dramas, Independe...| 1993.00|
|     s9|TV Show|The Great British...|     Andy Devonshire|United Kingdom| 9/24/2021|        2021| TV-14|9 Seasons|British TV Shows,...| 2021.00|
|    s10|  Movie|        The Starling|      Theodore Melfi| United States| 9/24/2021|        2021| PG-13|  104 min|    Comedies, Dramas| 2021.00|
|   s939|  Movie|Motu Patlu in the...|         Suhas Kadav|         India|  5/1/2021|        2019| TV-Y7|   87 min|Children & Family...| 2019.00|
|    s13|  Movie|        Je Suis Karl| Christian Schwochow|       Germany| 9/23/2021|        2021| TV-MA|  127 min|Dramas, Internati...| 2021.00|
|   s940|  Movie|Motu Patlu in Won...|         Suhas Kadav|         India|  5/1/2021|        2013| TV-Y7|   76 min|Children & Family...| 2013.00|
|   s941|  Movie|Motu Patlu: Deep ...|         Suhas Kadav|         India|  5/1/2021|        2014| TV-Y7|   76 min|Children & Family...| 2014.00|
|   s942|  Movie|Motu Patlu: Missi...|         Suhas Kadav|         India|  5/1/2021|        2013| TV-Y7|   71 min|Children & Family...| 2013.00|
|   s852|  Movie|    99 Songs (Tamil)|           Not Given|      Pakistan| 5/21/2021|        2021| TV-14|  131 min|Dramas, Internati...| 2021.00|
|   s471|  Movie|Bridgerton - The ...|Krysia Plonka, Kr...| United States| 7/13/2021|        2021| TV-14|   39 min|              Movies| 2021.00|
|   s730|  Movie|Bling Empire - Th...|Krysia Plonka, Kr...| United States| 6/12/2021|        2021| TV-MA|   36 min|              Movies| 2021.00|
|   s731|  Movie|Cobra Kai - The A...|Krysia Plonka, Kr...| United States| 6/12/2021|        2021| TV-MA|   34 min|              Movies| 2021.00|
|   s913|  Movie|The Circle - The ...|Krysia Plonka, Kr...| United States|  5/7/2021|        2021| TV-14|   35 min|            Comedies| 2021.00|
|     s4|TV Show|Jailbirds New Orl...|           Not Given|      Pakistan| 9/24/2021|        2021| TV-MA| 1 Season|Docuseries, Reali...| 2021.00|
|    s15|TV Show|Crime Stories: In...|           Not Given|      Pakistan| 9/22/2021|        2021| TV-MA| 1 Season|British TV Shows,...| 2021.00|
|  s3232|  Movie| True: Winter Wishes|Mark Thornton, To...| United States|11/26/2019|        2019|  TV-Y|   46 min|Children & Family...| 2019.00|
+-------+-------+--------------------+--------------------+--------------+----------+------------+------+---------+--------------------+--------+
only showing top 20 rows

In [115]:
spark.sql("SELECT * FROM df1 WHERE country='Pakistan'").show()
+-------+-------+--------------------+---------+--------+----------+------------+------+---------+--------------------+--------+
|show_id|   type|               title| director| country|date_added|release_year|rating| duration|           listed_in|New_year|
+-------+-------+--------------------+---------+--------+----------+------------+------+---------+--------------------+--------+
|   s852|  Movie|    99 Songs (Tamil)|Not Given|Pakistan| 5/21/2021|        2021| TV-14|  131 min|Dramas, Internati...| 2021.00|
|     s4|TV Show|Jailbirds New Orl...|Not Given|Pakistan| 9/24/2021|        2021| TV-MA| 1 Season|Docuseries, Reali...| 2021.00|
|    s15|TV Show|Crime Stories: In...|Not Given|Pakistan| 9/22/2021|        2021| TV-MA| 1 Season|British TV Shows,...| 2021.00|
|    s20|TV Show|              Jaguar|Not Given|Pakistan| 9/22/2021|        2021| TV-MA| 1 Season|International TV ...| 2021.00|
|    s32|TV Show|  Chicago Party Aunt|Not Given|Pakistan| 9/17/2021|        2021| TV-MA| 1 Season|         TV Comedies| 2021.00|
|    s34|TV Show|          Squid Game|Not Given|Pakistan| 9/17/2021|        2021| TV-MA| 1 Season|International TV ...| 2021.00|
|    s35|TV Show|Tayo and Little W...|Not Given|Pakistan| 9/17/2021|        2020| TV-Y7| 1 Season|            Kids' TV| 2020.00|
|    s75|TV Show|The World's Most ...|Not Given|Pakistan| 9/14/2021|        2021| TV-PG|2 Seasons|          Reality TV| 2021.00|
|    s84|TV Show|  Metal Shop Masters|Not Given|Pakistan| 9/10/2021|        2021| TV-MA| 1 Season|          Reality TV| 2021.00|
|    s86|TV Show|Pokémon Master Jo...|Not Given|Pakistan| 9/10/2021|        2021| TV-Y7| 1 Season|Anime Series, Kid...| 2021.00|
|    s88|TV Show|       Titipo Titipo|Not Given|Pakistan| 9/10/2021|        2019|  TV-Y|2 Seasons|Kids' TV, Korean ...| 2019.00|
|    s90|TV Show|         Mighty Raju|Not Given|Pakistan|  9/9/2021|        2017| TV-Y7|4 Seasons|            Kids' TV| 2017.00|
|   s101|TV Show|Tobot Galaxy Dete...|Not Given|Pakistan|  9/7/2021|        2019| TV-Y7|2 Seasons|            Kids' TV| 2019.00|
|   s122|TV Show|      Hotel Del Luna|Not Given|Pakistan|  9/2/2021|        2019| TV-14| 1 Season|International TV ...| 2019.00|
|   s133|TV Show|Brave Animated Se...|Not Given|Pakistan|  9/1/2021|        2021| TV-MA| 1 Season|International TV ...| 2021.00|
|   s148|TV Show|  How to Be a Cowboy|Not Given|Pakistan|  9/1/2021|        2021| TV-PG| 1 Season|          Reality TV| 2021.00|
|   s166|TV Show|            Oldsters|Not Given|Pakistan|  9/1/2021|        2019| TV-MA| 1 Season|Crime TV Shows, I...| 2019.00|
|   s190|TV Show|    Bread Barbershop|Not Given|Pakistan| 8/28/2021|        2020|  TV-Y|2 Seasons|Kids' TV, TV Come...| 2020.00|
|   s182|TV Show|Turning Point: 9/...|Not Given|Pakistan|  9/1/2021|        2021| TV-14| 1 Season|          Docuseries| 2021.00|
|   s187|TV Show|Hometown Cha-Cha-Cha|Not Given|Pakistan| 8/29/2021|        2021| TV-14| 1 Season|International TV ...| 2021.00|
+-------+-------+--------------------+---------+--------+----------+------------+------+---------+--------------------+--------+
only showing top 20 rows

In [121]:
# Assign new spark dataframe from sql query
pakistan_data = spark.sql("SELECT * FROM df1 WHERE country='Pakistan'")
pakistan_data.show()
type(pakistan_data)
+-------+-------+--------------------+---------+--------+----------+------------+------+---------+--------------------+--------+
|show_id|   type|               title| director| country|date_added|release_year|rating| duration|           listed_in|New_year|
+-------+-------+--------------------+---------+--------+----------+------------+------+---------+--------------------+--------+
|   s852|  Movie|    99 Songs (Tamil)|Not Given|Pakistan| 5/21/2021|        2021| TV-14|  131 min|Dramas, Internati...| 2021.00|
|     s4|TV Show|Jailbirds New Orl...|Not Given|Pakistan| 9/24/2021|        2021| TV-MA| 1 Season|Docuseries, Reali...| 2021.00|
|    s15|TV Show|Crime Stories: In...|Not Given|Pakistan| 9/22/2021|        2021| TV-MA| 1 Season|British TV Shows,...| 2021.00|
|    s20|TV Show|              Jaguar|Not Given|Pakistan| 9/22/2021|        2021| TV-MA| 1 Season|International TV ...| 2021.00|
|    s32|TV Show|  Chicago Party Aunt|Not Given|Pakistan| 9/17/2021|        2021| TV-MA| 1 Season|         TV Comedies| 2021.00|
|    s34|TV Show|          Squid Game|Not Given|Pakistan| 9/17/2021|        2021| TV-MA| 1 Season|International TV ...| 2021.00|
|    s35|TV Show|Tayo and Little W...|Not Given|Pakistan| 9/17/2021|        2020| TV-Y7| 1 Season|            Kids' TV| 2020.00|
|    s75|TV Show|The World's Most ...|Not Given|Pakistan| 9/14/2021|        2021| TV-PG|2 Seasons|          Reality TV| 2021.00|
|    s84|TV Show|  Metal Shop Masters|Not Given|Pakistan| 9/10/2021|        2021| TV-MA| 1 Season|          Reality TV| 2021.00|
|    s86|TV Show|Pokémon Master Jo...|Not Given|Pakistan| 9/10/2021|        2021| TV-Y7| 1 Season|Anime Series, Kid...| 2021.00|
|    s88|TV Show|       Titipo Titipo|Not Given|Pakistan| 9/10/2021|        2019|  TV-Y|2 Seasons|Kids' TV, Korean ...| 2019.00|
|    s90|TV Show|         Mighty Raju|Not Given|Pakistan|  9/9/2021|        2017| TV-Y7|4 Seasons|            Kids' TV| 2017.00|
|   s101|TV Show|Tobot Galaxy Dete...|Not Given|Pakistan|  9/7/2021|        2019| TV-Y7|2 Seasons|            Kids' TV| 2019.00|
|   s122|TV Show|      Hotel Del Luna|Not Given|Pakistan|  9/2/2021|        2019| TV-14| 1 Season|International TV ...| 2019.00|
|   s133|TV Show|Brave Animated Se...|Not Given|Pakistan|  9/1/2021|        2021| TV-MA| 1 Season|International TV ...| 2021.00|
|   s148|TV Show|  How to Be a Cowboy|Not Given|Pakistan|  9/1/2021|        2021| TV-PG| 1 Season|          Reality TV| 2021.00|
|   s166|TV Show|            Oldsters|Not Given|Pakistan|  9/1/2021|        2019| TV-MA| 1 Season|Crime TV Shows, I...| 2019.00|
|   s190|TV Show|    Bread Barbershop|Not Given|Pakistan| 8/28/2021|        2020|  TV-Y|2 Seasons|Kids' TV, TV Come...| 2020.00|
|   s182|TV Show|Turning Point: 9/...|Not Given|Pakistan|  9/1/2021|        2021| TV-14| 1 Season|          Docuseries| 2021.00|
|   s187|TV Show|Hometown Cha-Cha-Cha|Not Given|Pakistan| 8/29/2021|        2021| TV-14| 1 Season|International TV ...| 2021.00|
+-------+-------+--------------------+---------+--------+----------+------------+------+---------+--------------------+--------+
only showing top 20 rows

Out[121]:
pyspark.sql.dataframe.DataFrame